Next steps

  • Investigate seasonality
  • Improve plotting of interpolated regions

Initial import

# Load libraries
library(RCurl)
library(tidyverse)
library(magrittr)
library(plotly)
library(imputeTS)
library(lubridate)
library(timetk)
library(ggcorrplot)
library(gridExtra)
# Load data (relative file path)
df <- read.csv("../data/energy_dataset.csv")

# Formatting time variable to datetime object
# NB:  Time contains both UTC+1 and UTC+2 time-zones - date functions automatically pick this up and convert to UTC

df$time <- as_datetime(df$time)
head(df)
dim(df) # Number of rows and columns
## [1] 35064    29
summary(df) # Distribution and missingness for each variable
##       time                     generation.biomass
##  Min.   :2014-12-31 23:00:00   Min.   :  0.0     
##  1st Qu.:2016-01-01 04:45:00   1st Qu.:333.0     
##  Median :2016-12-31 10:30:00   Median :367.0     
##  Mean   :2016-12-31 10:30:00   Mean   :383.5     
##  3rd Qu.:2017-12-31 16:15:00   3rd Qu.:433.0     
##  Max.   :2018-12-31 22:00:00   Max.   :592.0     
##                                NA's   :19        
##  generation.fossil.brown.coal.lignite generation.fossil.coal.derived.gas
##  Min.   :  0.0                        Min.   :0                         
##  1st Qu.:  0.0                        1st Qu.:0                         
##  Median :509.0                        Median :0                         
##  Mean   :448.1                        Mean   :0                         
##  3rd Qu.:757.0                        3rd Qu.:0                         
##  Max.   :999.0                        Max.   :0                         
##  NA's   :18                           NA's   :18                        
##  generation.fossil.gas generation.fossil.hard.coal generation.fossil.oil
##  Min.   :    0         Min.   :   0                Min.   :  0.0        
##  1st Qu.: 4126         1st Qu.:2527                1st Qu.:263.0        
##  Median : 4969         Median :4474                Median :300.0        
##  Mean   : 5623         Mean   :4256                Mean   :298.3        
##  3rd Qu.: 6429         3rd Qu.:5839                3rd Qu.:330.0        
##  Max.   :20034         Max.   :8359                Max.   :449.0        
##  NA's   :18            NA's   :18                  NA's   :19           
##  generation.fossil.oil.shale generation.fossil.peat generation.geothermal
##  Min.   :0                   Min.   :0              Min.   :0            
##  1st Qu.:0                   1st Qu.:0              1st Qu.:0            
##  Median :0                   Median :0              Median :0            
##  Mean   :0                   Mean   :0              Mean   :0            
##  3rd Qu.:0                   3rd Qu.:0              3rd Qu.:0            
##  Max.   :0                   Max.   :0              Max.   :0            
##  NA's   :18                  NA's   :18             NA's   :18           
##  generation.hydro.pumped.storage.aggregated
##  Mode:logical                              
##  NA's:35064                                
##                                            
##                                            
##                                            
##                                            
##                                            
##  generation.hydro.pumped.storage.consumption
##  Min.   :   0.0                             
##  1st Qu.:   0.0                             
##  Median :  68.0                             
##  Mean   : 475.6                             
##  3rd Qu.: 616.0                             
##  Max.   :4523.0                             
##  NA's   :19                                 
##  generation.hydro.run.of.river.and.poundage generation.hydro.water.reservoir
##  Min.   :   0.0                             Min.   :   0                    
##  1st Qu.: 637.0                             1st Qu.:1077                    
##  Median : 906.0                             Median :2164                    
##  Mean   : 972.1                             Mean   :2605                    
##  3rd Qu.:1250.0                             3rd Qu.:3757                    
##  Max.   :2000.0                             Max.   :9728                    
##  NA's   :19                                 NA's   :18                      
##  generation.marine generation.nuclear generation.other
##  Min.   :0         Min.   :   0       Min.   :  0.00  
##  1st Qu.:0         1st Qu.:5760       1st Qu.: 53.00  
##  Median :0         Median :6566       Median : 57.00  
##  Mean   :0         Mean   :6264       Mean   : 60.23  
##  3rd Qu.:0         3rd Qu.:7025       3rd Qu.: 80.00  
##  Max.   :0         Max.   :7117       Max.   :106.00  
##  NA's   :19        NA's   :17         NA's   :18      
##  generation.other.renewable generation.solar generation.waste
##  Min.   :  0.00             Min.   :   0     Min.   :  0.0   
##  1st Qu.: 73.00             1st Qu.:  71     1st Qu.:240.0   
##  Median : 88.00             Median : 616     Median :279.0   
##  Mean   : 85.64             Mean   :1433     Mean   :269.5   
##  3rd Qu.: 97.00             3rd Qu.:2578     3rd Qu.:310.0   
##  Max.   :119.00             Max.   :5792     Max.   :357.0   
##  NA's   :18                 NA's   :18       NA's   :19      
##  generation.wind.offshore generation.wind.onshore forecast.solar.day.ahead
##  Min.   :0                Min.   :    0           Min.   :   0            
##  1st Qu.:0                1st Qu.: 2933           1st Qu.:  69            
##  Median :0                Median : 4849           Median : 576            
##  Mean   :0                Mean   : 5464           Mean   :1439            
##  3rd Qu.:0                3rd Qu.: 7398           3rd Qu.:2636            
##  Max.   :0                Max.   :17436           Max.   :5836            
##  NA's   :18               NA's   :18                                      
##  forecast.wind.offshore.eday.ahead forecast.wind.onshore.day.ahead
##  Mode:logical                      Min.   :  237                  
##  NA's:35064                        1st Qu.: 2979                  
##                                    Median : 4855                  
##                                    Mean   : 5471                  
##                                    3rd Qu.: 7353                  
##                                    Max.   :17430                  
##                                                                   
##  total.load.forecast total.load.actual price.day.ahead   price.actual   
##  Min.   :18105       Min.   :18041     Min.   :  2.06   Min.   :  9.33  
##  1st Qu.:24794       1st Qu.:24808     1st Qu.: 41.49   1st Qu.: 49.35  
##  Median :28906       Median :28901     Median : 50.52   Median : 58.02  
##  Mean   :28712       Mean   :28697     Mean   : 49.87   Mean   : 57.88  
##  3rd Qu.:32263       3rd Qu.:32192     3rd Qu.: 60.53   3rd Qu.: 68.01  
##  Max.   :41390       Max.   :41015     Max.   :101.99   Max.   :116.80  
##                      NA's   :36

Missing data

There are eight columns that contain no values (all NAs or zeroes). These are removed here.

df <- df %>% 
  select(-c(generation.fossil.coal.derived.gas,
            generation.fossil.oil.shale,
            generation.fossil.peat,
            generation.geothermal,
            generation.marine,
            generation.hydro.pumped.storage.aggregated,
            generation.wind.offshore,
            forecast.wind.offshore.eday.ahead,
            total.load.actual)) # Removing this one too as we're not analysing it
colSums(is.na(df)) # Number of missing values for each remaining variable
##                                        time 
##                                           0 
##                          generation.biomass 
##                                          19 
##        generation.fossil.brown.coal.lignite 
##                                          18 
##                       generation.fossil.gas 
##                                          18 
##                 generation.fossil.hard.coal 
##                                          18 
##                       generation.fossil.oil 
##                                          19 
## generation.hydro.pumped.storage.consumption 
##                                          19 
##  generation.hydro.run.of.river.and.poundage 
##                                          19 
##            generation.hydro.water.reservoir 
##                                          18 
##                          generation.nuclear 
##                                          17 
##                            generation.other 
##                                          18 
##                  generation.other.renewable 
##                                          18 
##                            generation.solar 
##                                          18 
##                            generation.waste 
##                                          19 
##                     generation.wind.onshore 
##                                          18 
##                    forecast.solar.day.ahead 
##                                           0 
##             forecast.wind.onshore.day.ahead 
##                                           0 
##                         total.load.forecast 
##                                           0 
##                             price.day.ahead 
##                                           0 
##                                price.actual 
##                                           0

16 out of 22 of the remaining variables each have a maximum of 36 missing values. Looking at where these occur, we see that most missing values occur across variables. Specifically, 90% of the missing values occur over only 18 rows. The remaining 10% are spread across 28 rows.

# Rows where data is missing
df[!complete.cases(df),]
# Number of NAs per row
missing_per_row <- rowSums(is.na(df[!complete.cases(df),]))
missing_per_row
##   100   109   110   111   112   113   114   452   453   644   662  2529  2709 
##    14    14    14    14    14    14    14    14    14    14    14    14    14 
##  3969  6587  8050 11237 12673 13342 13392 15983 16613 30897 
##    14    14    14     1     1    13     1     1     1    14
sum(missing_per_row > 1)
## [1] 18
sum(missing_per_row[missing_per_row > 1])/sum(missing_per_row)
## [1] 0.9804688

We can visualise where in the series the missing values occur using the following function:

ggplot_na_distribution(df$generation.biomass)

However, this only allows us to look at one variable at a time. There are other functions in the imputeTS package which can be useful for investigating missingness in a (univariate) time series.

We also find three rows where there are anomalous zero values for every generation source apart from Gas. These look as if they should be missing rather than a true 0 value. I’ve also found two other rows where there is a strange drop off in electricity generation - these were found by calculating the total power for each row, as shown below.

df %>% filter_by_time(time, '2017-11-12 20:00:00', '2017-11-14 18:30:00')
total_power <- df %>% 
  select(c(1, starts_with("generation"))) %>%
  pivot_longer(cols = -time, names_to = "generation_source", values_to = 'value') %>%
  #group_by(generation_source) %>%
  summarise_by_time(time, .by = 'hour', total_power = sum(value)) %>%
  arrange(total_power)

total_power
boxplot(total_power$total_power, main = 'Box plot of hourly total power')

We have decided to interpolate the missing values using linear interpolation. The only places where this might be an issue is where are two gaps of length 6 and 8. These will be visualised below.

# Interpolate missing values using linear interpolation
df_clean <- df %>% mutate(across(generation.biomass:price.actual, .fns = na_interpolation, option = 'linear'))

# Verifying missing values have been filled
df_clean[!complete.cases(df_clean),]

Visualising interpolation

# Creating a 'missing' flag to indicate imputation
df_clean$interp_biomass <- as.integer(is.na(df$generation.biomass))

ggplot(data = df_clean[90:150,]) + geom_line(aes(x=time, y = generation.biomass, colour = interp_biomass), show.legend = FALSE) + scale_color_gradient(low="black", high="red") + labs(title = "Example of linear interpolation (in red) applied to the dataset")

What are we left with in df_clean?

35,000 obs across 20 variables, hourly from start 2015 to end 2018:

  • Energy generation from 14 different sources

  • Forecast onshore wind and solar, one day ahead

  • Forecast and actual total electricity load

  • Forecast and actual electricity price

And in the other dataset, not yet loaded: hourly weather data over the same period for a number of Spanish cities

Investigating generation methods

gen_data <- df_clean %>% 
  select(c(1, starts_with("generation")))

# Giving vars simpler names
names(gen_data) <- c("time",
                     "Biomass",
                    "Lignite",
                    "Gas",
                    "Hard Coal",
                    "Oil",
                    "Hydro Pumped",
                    "Hydro River",
                    "Hydro Reservoir",
                    "Nuclear",
                    "Other",
                    "Other Renewable",
                    "Solar",
                    "Waste",
                    "Wind") #Renaming from Wind Onshore as no other wind var
# Converting to long form, grouping
long_gen_data <- gen_data  %>% 
  pivot_longer(cols = -time, names_to = "generation_source") %>%
  group_by(generation_source)

head(long_gen_data)

Overall statistics

overall_stats <- long_gen_data %>%
  summarise(sum = sum(value, na.rm = TRUE),
            mean = mean(value, na.rm = TRUE),
            sd = sd(value, na.rm = TRUE),
            max = max(value, na.rm = TRUE)) %>%
  mutate(generation_source= fct_reorder(generation_source, desc(sum)))

sources_by_sum <- arrange(overall_stats, desc(sum))
sources_by_sum
ggplot(data = overall_stats) + geom_col(aes(x = generation_source, y = sum)) + theme(axis.text.x = element_text(angle=90)) + labs(title = "Total electricity generated by source")

sources_by_sum %>% pivot_longer(cols = c(sum, mean, sd, max), names_to = "stat", 
                                values_to = "value") %>%
  filter(stat != "sum") %>%
  ggplot() + geom_col(aes(x = generation_source, y = value, fill = stat), position = "dodge") + theme(axis.text.x = element_text(angle=90)) + labs(title = "Other overall hourly statistics of each source")

These plots show that nuclear generation has produced the most electricity over the time series, and has the highest mean hourly value. Nuclear is also a very steady generation method, having the smallest standard deviation out of the top six generation sources. Gas has the second highest total and mean generation, and the highest maximum value in an hour. Wind, the third largest contributor, is the most variable generation source, and has the second highest maximum hourly value.

Using this information to re-factor the generation_source variable, so that plots will order the sources from most to least important.

long_gen_data$generation_source <- factor(long_gen_data$generation_source, levels =  sources_by_sum$generation_source)

Smoothing the data on a daily scale

gen_data_smoothed <- long_gen_data %>% 
  summarise_by_time(time, .by = 'week', adjusted = mean(value)) 

head(gen_data_smoothed)

Individual time series plots

Looking at each series over time in isolation, with quarterly smoothing (This function is nice as it includes an interactive feature)

gen_data_smoothed %>% 
  filter(generation_source %in% sources_by_sum$generation_source[1:6]) %>% 
  plot_time_series(time, adjusted, .facet_ncol = 3,
                   .smooth_period = "12 months", .facet_scales='fixed',
                   .title= "Weekly averages for six biggest generation sources + 12-month smoothing")
gen_data_smoothed %>% 
  filter(generation_source %in% sources_by_sum$generation_source[7:14]) %>% 
  plot_time_series(time, adjusted, .facet_ncol = 3,
                   .smooth_period = "12 months", .facet_scales='fixed',
                   .title= "Weekly averages for six smallest generation sources + 12-month smoothing")

These plots emphasise the relatively steady week-to-week electricity generation from nuclear over the time period, compared with the high variability in wind and gas (note that each row has the same y-axis). Solar shows a strong seasonal pattern within each year, peaking in July (summer) and reaching a low in December (winter) each year. All hydro sources appears to show seasonality on a two-year scale.

In terms of long-term trends, biomass, oil, and “other” all show a similar decrease in production at the start of 2016, followed by a steady period until the end of the series. Meanwhile, Waste and Other Renewable show a similar increasing trend over the four year period. Gas increases slightly, whereas Hard Coal decreased slightly. Nuclear, Wind, Solar, Lignite, and all Hydro sources are steady.

Stacked area plots

long_gen_data %>% 
  summarise_by_time(time, .by = '3 months', adjusted = mean(value)) %>% 
  ggplot() + 
  geom_area(aes(x = time, y = adjusted, fill = generation_source), stat = "identity")

# Proportions
gen_data_props <- cbind(gen_data[1], prop.table(as.matrix(gen_data[-1]), margin = 1))

long_gen_data_props <- gen_data_props %>% 
  pivot_longer(cols = -time, names_to = "generation_source") 

# Ordering sources by importance
long_gen_data_props$generation_source <- factor(long_gen_data_props$generation_source, levels =  sources_by_sum$generation_source)

long_gen_data_props %>%
  group_by(generation_source) %>% 
  summarise_by_time(time, .by = '3 months', adjusted = mean(value)) %>% 
  ggplot() + 
  geom_area(aes(x = time, y = adjusted, fill = generation_source), stat = "identity") +
  theme_minimal()

Correlations

# Correlation plot. Sample size for computational efficiency.
df_corr <- gen_data[1:10000,2:ncol(gen_data)]

corrmatrix <- df_corr %>% cor()
ggcorrplot(corrmatrix,
            hc.order = TRUE,
            type = "lower",
            lab = FALSE)

# Extract individual correlations using the following:
corrmatrix['Hard Coal', 'Hydro Pumped']
## [1] -0.5379619
corrmatrix['Hard Coal', 'Wind']
## [1] -0.5821862
corrmatrix['Hard Coal', 'Lignite']
## [1] 0.8355603
corrmatrix['Hydro River', 'Hydro Reservoir']
## [1] 0.6465254

The most positive correlation is between Hard Coal and Lignite (0.83), followed by Hydro River and Hydro Reservoir (0.65). This makes sense given the similarity of these sources. The most negative correlation is between Hard Coal and Wind (-0.58), followed by Hard Coal and Hydro Pumped (-0.54). These relationships could reflect the variability in the generation by the renewable sources (Wind and Hydro, see figures …), and the need to compensate with fossil fuels when generation is low. These relationships are further examined in the plots below.

long_gen_data[1:10000,] %>% filter(generation_source %in% c('Wind', 'Hard Coal','Lignite')) %>% ungroup() %>% plot_time_series(time, value, .color_var = generation_source, .smooth = 0)
long_gen_data[1:10000,] %>% filter(generation_source %in% c('Hydro Reservoir', 'Hydro River')) %>% ungroup() %>% plot_time_series(time, value, .color_var = generation_source, .smooth = 0)

Seasonality

# Create time variables
long_gen_data$year <- year(long_gen_data$time)
long_gen_data$month <- month(long_gen_data$time)
long_gen_data$day <- day(long_gen_data$time)
long_gen_data$hour <- hour(long_gen_data$time)

long_gen_data_props$year <- year(long_gen_data_props$time)
long_gen_data_props$month <- month(long_gen_data_props$time)
long_gen_data_props$day <- day(long_gen_data_props$time)
long_gen_data_props$hour <- hour(long_gen_data_props$time)

Seasonality within a day

hourly_generation_plot_by_year <- long_gen_data %>%
  ggplot(aes(x = hour, y = value, fill = generation_source)) +
  geom_bar(stat = "identity")

hourly_generation_plot_by_year

# Proportions
hourly_generation_plot_by_year_props <- long_gen_data_props %>%
  ggplot(aes(x = hour, y = value, fill = generation_source)) +
  geom_bar(stat = "identity")

hourly_generation_plot_by_year_props

Seasonality within a month

daily_generation_plot_by_year <- long_gen_data %>%
  ggplot(aes(x = day, y = value, fill = generation_source)) +
  geom_bar(stat = "identity") +
  facet_wrap(~year) + theme(legend.position = "none") 

daily_generation_plot_by_year

# Note: Similar problem to raw values
daily_generation_plot_by_year_props <- long_gen_data_props %>%
  ggplot(aes(x = day, y = value, fill = generation_source)) +
  geom_bar(stat = "identity") +
  facet_wrap(~year) + theme(legend.position = "none") 

daily_generation_plot_by_year_props

Seasonality within a year

monthly_generation_plot <- long_gen_data %>%
  ggplot(aes(x = month, y = value, fill = generation_source)) +
  geom_bar(stat = "identity") +
  facet_wrap(~year)

monthly_generation_plot

# Note: Shorter bars for shorter months as above
monthly_generation_plot_props <- long_gen_data_props %>%
  ggplot(aes(x = month, y = value, fill = generation_source)) +
  geom_bar(stat = "identity") +
  facet_wrap(~year)

monthly_generation_plot_props

Variation in maximum and minimum load

Daily averages
datain <- long_gen_data %>% group_by(generation_source, year, day, month) %>% summarise(across(c(value), c(max, min)))

datain <- datain %>%
  mutate(date = make_date(year, month, day))
p <- list()
names <- unique(datain$generation_source)
for(i in 1:length(names)){
  datain2 <- filter(datain, generation_source == names[i])
  p[[i]] <- datain2 %>% ggplot()+geom_line(aes(x = date, y = value_1, color = "max"))+geom_line(aes(x = date, y = value_2, color = "min"))+theme(legend.position="bottom")+ggtitle(names[i])+
  stat_smooth(formula = y ~ x,method = "loess", col = "red",aes(x = date, y = value_1))+
  stat_smooth(formula = y ~ x, method = "loess", col = "blue",aes(x = date, y = value_2))

}

do.call(grid.arrange,c(p,ncol=2))

  • mostly parallel suggesting max and minimum load on the grid for each type remains fairly constant despite overall changes in level.
  • Notable exemptions to this include oil where max has decreased in proportion. Hydro Pumped where max is decreasing, solar where max is decreasing in comparison.
  • Overall decreasing trends: hard coal, biomass, oil, wind onshore, other, lignite
  • Overall increasing trends: Gas, waste, other renewable, hydro river step changes seen in:
  • hard coal, mid 2016 - mid 2017, increase
  • gas, mid 2016 - mid 2017, increase
  • Hydro Reservoir, mid 2016 - mid 2017, decrease
  • Hydro river, mid 2016 - mid 2017, decrease
  • lignite, mid 2016 - mid 2017, increase
  • oil, start 2016, drop
  • other renewable, 2016 start, 2017 start, increase *other, 2016 start decrease
Daily average violin plots
q <- list()

for(i in 1:length(names)){
  datain2 <- filter(datain, generation_source == names[i])
  q[[i]] <-  ggplot(datain2 ) + geom_violin(aes(x = 1, y = value_2))+ geom_violin(aes(x = 2, y = value_1)) + geom_boxplot(aes(x = 1, y = value_2), width=0.1)+ geom_boxplot(aes(x = 2, y = value_1), width=0.1)+ggtitle(names[i])}

do.call(grid.arrange,c(q,ncol=2))

Looking at violin plots for max and min:

  • clear preferred modes eg nuclear three defined levels in both
  • three overall trends - show predominant variation in max / min or equal in both
Monthly averages
#long_gen_data[is.na(long_gen_data)] <- mean(long_gen_data$value)

datain <- long_gen_data %>% group_by(generation_source, year, month) %>% summarise(across(c(value), c(max, min)))
## `summarise()` has grouped output by 'generation_source', 'year'. You can override using the `.groups` argument.
datain2 <- datain %>%
  mutate(date = make_date(year, month))

names <- unique(datain2$generation_source)
for(i in 1:length(names)){
  datain3 <- filter(datain2, generation_source == names[i])
  p[[i]] <- datain3 %>% ggplot()+geom_line(aes(x = date, y = value_1, color = "max"))+geom_line(aes(x = date, y = value_2, color = "min"))+theme(legend.position="bottom")+ggtitle(names[i])
}

do.call(grid.arrange,c(p,ncol=2))